#~/opt/mpich/bin/mpicc -O2 mpiBench.c -o mpiBench

# Open MPI MCA parameters, supplied through the environment (OMPI_MCA_<name>)
# and exported so that every command started later in this script inherits
# them:
#   btl_sm_eager_limit, btl_sm_max_send_size      -> 32768
#   btl_sm_use_knem, btl_sm_use_cma,
#   mpi_leave_pinned                              -> 1
#   btl                                           -> ^openib
#     (original note: "Force shared memory for intra-node")
export \
  OMPI_MCA_btl_sm_eager_limit=32768 \
  OMPI_MCA_btl_sm_max_send_size=32768 \
  OMPI_MCA_btl_sm_use_knem=1 \
  OMPI_MCA_mpi_leave_pinned=1 \
  OMPI_MCA_btl='^openib' \
  OMPI_MCA_btl_sm_use_cma=1

# Build mpi_collective_bench from mpi_collective_bench.c with the hmpi
# compiler wrapper: -O3, linked with -lnuma and -pthread, and with the
# USE_NUMA and USE_ALIGNED_BUFFER preprocessor macros defined.
#
# Abort if the build fails: the steps that follow wipe the remote tree and
# launch ./mpi_collective_bench, which would otherwise run a stale or missing
# binary and produce misleading results.
~/opt/hmpi/bin/mpicc -O3 mpi_collective_bench.c -o mpi_collective_bench \
    -lnuma -pthread -DUSE_NUMA -DUSE_ALIGNED_BUFFER || {
  printf '%s\n' "error: failed to build mpi_collective_bench" >&2
  exit 1
}

# Deploy the local ~/hpc-bench/mpi_bench directory to each remote node listed
# in the loop. Per host, the remote ~/hpc-bench tree is removed and an empty
# ~/hpc-bench/mpi_bench is recreated before the recursive copy.
# bms-002 is currently not deployed to; add it to the list to include it.
for host in bms-003; do
  ssh "$host" "rm -rf ~/hpc-bench && mkdir -p ~/hpc-bench/mpi_bench"
  scp -r ~/hpc-bench/mpi_bench "${host}:~/hpc-bench/"
done

#export HCCL_INTER_HCCS_DISABLE=TRUE


#~/opt/hmpi/bin/mpirun -np 4 --map-by node --hostfile hostfile    \
#-x HCCL_INTER_HCCS_DISABLE=TRUE \
#-x MPI_INTER_HCCS_DISABLE=TRUE \
#--mca pml ucx --mca osc ucx --mca btl ^openib     \
#-x UCX_NET_DEVICES=roceo1:1,roceo5:1     \
#-x UCX_TLS=rc     -x UCX_IB_GID_INDEX=1     \
#-x UCX_IB_ADDR_TYPE=ib_global     -x UCX_IB_RX_QUEUE_LEN=8192     \
#-x UCX_IB_TX_QUEUE_LEN=8192     -x UCX_IB_SL=0     -x UCX_LOG_LEVEL=info     \
#~/hpc-bench/mpiBench/mpiBench -b 67108864 -e 1073741824

# Disabled scenario, kept for reference: 4 ranks mapped by socket over
# bms-001 and bms-002. Everything up to the closing COMMENT line is fed to the
# no-op ':' builtin and discarded; the quoted delimiter stops the shell from
# expanding anything inside the block.
: <<'COMMENT'
#test distributed by socket
# 0 <--> 2
# 1 <--> 3
cat << EOF > hostfile
bms-001
bms-001
bms-002
bms-002
EOF

~/opt/hmpi/bin/mpirun -np 4 --map-by socket --hostfile hostfile    \
-x HCCL_INTER_HCCS_DISABLE=TRUE \
-x MPI_INTER_HCCS_DISABLE=TRUE \
--mca pml ucx --mca osc ucx --mca btl ^openib     \
-x UCX_NET_DEVICES=roceo1:1,roceo5:1     \
-x UCX_TLS=rc     -x UCX_IB_GID_INDEX=1     \
-x UCX_IB_ADDR_TYPE=ib_global     -x UCX_IB_RX_QUEUE_LEN=8192     \
-x UCX_IB_TX_QUEUE_LEN=8192     -x UCX_IB_SL=0     -x UCX_LOG_LEVEL=info     \
~/hpc-bench/mpi_bench/mpi_collective_bench -b 67108864 -e 1073741824
COMMENT



# Generate ./rankfile for the active run: ranks 0-3 are placed on bms-001 and
# ranks 4-7 on bms-003, each with slot list 0-8
# (original note: "test processes on socket0 of each node").
{
  for rank in 0 1 2 3; do
    printf 'rank %s=bms-001 slot=0-8\n' "$rank"
  done
  for rank in 4 5 6 7; do
    printf 'rank %s=bms-003 slot=0-8\n' "$rank"
  done
} > rankfile


# Disabled scenario, kept for reference: a 2-rank mpiBench run driven by
# ./rankfile. The block is fed to the no-op ':' builtin and discarded; the
# quoted delimiter stops the shell from expanding anything inside it.
: <<'COMMENT'
~/opt/hmpi/bin/mpirun -np 2 \
--rankfile rankfile \
--report-bindings \
--hostfile hostfile    \
~/hpc-bench/m/mpiBench -b 33554432 -e 67108864
COMMENT

# Active run: start 8 ranks of ./mpi_collective_bench through the hmpi mpirun,
# placed according to ./rankfile. MCA parameters are given with --mca, the
# UCX_* settings are passed with -x, and --report-bindings is requested.
~/opt/hmpi/bin/mpirun \
  --mca btl_sm_eager_limit 32768 \
  --mca btl_sm_max_send_size 32768 \
  --mca btl_sm_use_knem 1 \
  --mca mpi_leave_pinned 1 \
  -x UCX_NET_DEVICES=roceo1:1,roceo5:1 \
  -x UCX_TLS=rc \
  -x UCX_IB_GID_INDEX=1 \
  -x UCX_IB_ADDR_TYPE=ib_global \
  -x UCX_IB_RX_QUEUE_LEN=8192 \
  -x UCX_IB_TX_QUEUE_LEN=8192 \
  -x UCX_IB_SL=0 \
  -x UCX_LOG_LEVEL=info \
  --bind-to core \
  --report-bindings \
  --rankfile rankfile \
  -np 8 \
  ./mpi_collective_bench -b 33554432 -e 67108864


# Disabled scenario, kept for reference: an alternative rankfile using slot
# ranges 0-8 and 80-88 on bms-001 and bms-002, followed by an mpiBench run.
# The block is fed to the no-op ':' builtin and discarded; the quoted
# delimiter stops the shell from expanding anything inside it.
# NOTE(review): the rankfile below lists ranks 0-9 while mpirun is given
# -np 4, and the inner note mentions socket0 although slots 80-88 are used
# as well — confirm before re-enabling.
: <<'COMMENT'
#test processes on socket0 of each node
cat << EOF > rankfile
rank 0=bms-001 slot=0-8
rank 1=bms-001 slot=0-8
rank 2=bms-001 slot=80-88
rank 3=bms-001 slot=80-88
rank 4=bms-002 slot=0-8
rank 5=bms-002 slot=0-8
rank 6=bms-002 slot=80-88
rank 7=bms-002 slot=80-88
rank 8=bms-002 slot=0-8
rank 9=bms-002 slot=0-8
EOF


~/opt/hmpi/bin/mpirun -np 4 \
--rankfile rankfile \
--report-bindings \
--hostfile hostfile    \
-x HCCL_INTER_HCCS_DISABLE=TRUE \
-x MPI_INTER_HCCS_DISABLE=TRUE \
--mca pml ucx --mca osc ucx --mca btl ^openib     \
-x UCX_NET_DEVICES=roceo1:1,roceo5:1     \
-x UCX_TLS=rc     -x UCX_IB_GID_INDEX=1     \
-x UCX_IB_ADDR_TYPE=ib_global     -x UCX_IB_RX_QUEUE_LEN=8192     \
-x UCX_IB_TX_QUEUE_LEN=8192     -x UCX_IB_SL=0     -x UCX_LOG_LEVEL=info     \
~/hpc-bench/mpiBench/mpiBench -b 67108864 -e 67108864
COMMENT

#~/opt/mpich/bin/mpirun -np 3  \
#--map-by node --hostfile hostfile  ~/hpc-bench/mpiBench/mpiBench -b 67108864 -e 1073741824

#~/opt/mpich/bin/mpirun -np 2 --hostfile hostfile  ~/hpc-bench/mpiBench/mpiBench -b 67108864 -e 1073741824

#~/opt/hmpi/bin/mpirun -np 3  --map-by node --hostfile m.txt    ./mpiBench/mpiBench -b 67108864 -e 1073741824

# Copy the *rank*.txt result files from the remote nodes into the current
# directory. The remote patterns are single-quoted so the local shell never
# tries to glob them; expansion is left to the remote side.
# NOTE(review): assumes the benchmark leaves *rank*.txt files in its working
# directory on each node — confirm against the benchmark source.

# NOTE(review): bms-002 is neither deployed to nor listed in the active
# rankfile, so this fetches whatever an earlier run left in mpiBench —
# confirm it is still wanted.
scp 'bms-002:~/hpc-bench/mpiBench/*rank*.txt' .

# On bms-003 the deploy step of this script removes ~/hpc-bench and recreates
# only ~/hpc-bench/mpi_bench, so ~/hpc-bench/mpiBench cannot exist there;
# fetch from the directory the benchmark was actually deployed to.
scp 'bms-003:~/hpc-bench/mpi_bench/*rank*.txt' .

